K Nearest Neighbour Project
Since KNN is such a simple algorithm, we will just use this “Project” as a simple exercise to test your understanding of the implementation of KNN.
Iris Dataset
We’ll use the famous iris data set for this project. It’s a small data set with flower features that can be used to attempt to predict the species of an iris flower.
if(!require(ggpubr)){
install.packages("ggpubr")
library(ggpubr)
}
if(!require(ISLR2)){
install.packages("ISLR2")
library(ISLR2)
}
if(!require(ggplot2)){
install.packages("ggplot2")
library(ggplot2)
}
if(!require(ggdark)){
install.packages("ggdark")
library(ggdark)
}
if(!require(caTools)){
install.packages("caTools")
library(caTools)
}
if(!require(class)){
install.packages("class")
library(class)
}
if(!require(plotly)){
install.packages("plotly")
library(plotly)
}
Loading iris dataset
# Take a working copy of the built-in iris data and preview its first rows.
df <- datasets::iris
head(df, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
- Dataset structure
# Show each column's type together with a preview of its values.
utils::str(df)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
- Dataset summary
# Per-column summary: quantiles for numeric columns, counts for the factor.
summary(object = df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Standardize data
- The iris data set has all its features in the same order of magnitude, but it's good practice (especially with KNN) to standardize features in your data.
Use scale() to standardize the feature columns
of the iris dataset. Set this standardized version of the data as a new
variable.
# Centre and scale the four numeric measurement columns (columns 1 to 4);
# the Species factor in column 5 is left out.
standard.feature <- scale(df[, 1:4], center = TRUE, scale = TRUE)
- Checking that the scaling worked by checking the variance of one of the new columns.
# Sanity check: a standardized column should have unit variance.
var(standard.feature[, "Sepal.Length"])
## [1] 1
- Join the standardized data with the response/target/label column (the column with the species names).
# Re-attach the Species label column to the standardized features, then
# preview the combined data frame.
final.data <- cbind(standard.feature, df["Species"])
head(final.data, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
## 2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
## 3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
## 4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
## 5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
## 6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
Train-test split
# Split the standardized data into training and test sets using a logical
# mask from caTools::sample.split() built on the Species labels
# (SplitRatio = 0.70). The seed is fixed so the split is reproducible.
set.seed(101)
sample <- sample.split(final.data$Species, SplitRatio = 0.70)
# Index with the logical mask directly. Writing
# `subset(final.data, sample = TRUE)` (single `=`) passes `sample` as an
# unused named argument, so no filter is applied and every row is returned
# for both train and test.
train <- final.data[sample, ]
test <- final.data[!sample, ]
Build a KNN model.
# Fit a KNN classifier (k = 10) on the first two feature columns
# (Sepal.Length, Sepal.Width), using column 5 (Species) of the training
# set as the class labels, and predict the class of every test row.
k <- 10
knn_model <- knn(train[, 1:2], test[, 1:2], train[, 5], k = k)
# Attach the predicted class to the test rows for plotting.
test_with_pred <- cbind(test, Predicted_Species = knn_model)
# Scatter plot of the test points coloured by predicted species, with a
# filled ellipse per predicted class (ggplot2 + ggdark theme).
ggplot(
  test_with_pred,
  aes(x = Sepal.Width, y = Sepal.Length, color = Predicted_Species)
) +
  geom_point(size = 3, alpha = 0.9) +
  # The outline colour is fixed to white, so only `fill` is mapped here.
  stat_ellipse(
    aes(fill = Predicted_Species),
    geom = "polygon", alpha = 0.4, color = "white"
  ) +
  # Axis labels must match the aes() mapping: x is Sepal.Width and
  # y is Sepal.Length.
  labs(
    title = paste0("KNN Cluster Probability Ellipses (k = ", k, ")"),
    x = "Sepal.Width", y = "Sepal.Length",
    fill = "Predicted Species", color = "Predicted Species"
  ) +
  scale_color_manual(
    values = c("setosa" = "blue", "versicolor" = "green", "virginica" = "red")
  ) +
  scale_fill_manual(
    values = c(
      "setosa" = "lightblue",
      "versicolor" = "lightgreen",
      "virginica" = "lightcoral"
    )
  ) +
  dark_theme_light()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
# Fit a KNN classifier (k = 5) on the first two feature columns
# (Sepal.Length, Sepal.Width), using column 5 (Species) of the training
# set as the class labels, and predict the class of every test row.
k <- 5
knn_model <- knn(train[, 1:2], test[, 1:2], train[, 5], k = k)
# Attach the predicted class to the test rows for plotting.
test_with_pred <- cbind(test, Predicted_Species = knn_model)
# Overlay the training points (single "Train" group) and the test points
# (grouped by predicted species), each with a filled ellipse, then convert
# the ggplot object to an interactive plotly widget.
knn.plt <- ggplot() +
  geom_point(
    data = train,
    aes(x = Sepal.Width, y = Sepal.Length, color = "Train"),
    size = 3, alpha = 0.7
  ) +
  geom_point(
    data = test_with_pred,
    aes(x = Sepal.Width, y = Sepal.Length, color = Predicted_Species),
    size = 3, alpha = 0.7
  ) +
  stat_ellipse(
    data = train,
    aes(x = Sepal.Width, y = Sepal.Length, fill = "Train"),
    geom = "polygon", alpha = 0.2, color = "black"
  ) +
  stat_ellipse(
    data = test_with_pred,
    aes(x = Sepal.Width, y = Sepal.Length, fill = Predicted_Species),
    geom = "polygon", alpha = 0.2, color = "black"
  ) +
  # Axis labels must match the aes() mapping: x is Sepal.Width and
  # y is Sepal.Length. The colour and fill legends share one title.
  labs(
    title = paste0("KNN Cluster Probability Ellipses (k = ", k, ")"),
    x = "Sepal.Width", y = "Sepal.Length",
    fill = "Dataset", color = "Dataset"
  ) +
  scale_color_manual(
    values = c(
      "Train" = "blue",
      "setosa" = "green",
      "versicolor" = "yellow",
      "virginica" = "red"
    )
  ) +
  # Fill shades are the light counterparts of the point colours above
  # (yellow -> lightyellow, red -> lightcoral).
  scale_fill_manual(
    values = c(
      "Train" = "lightblue",
      "setosa" = "lightgreen",
      "versicolor" = "lightyellow",
      "virginica" = "lightcoral"
    )
  ) +
  dark_theme_light()
ggplty <- ggplotly(knn.plt)
ggplty